pacman::p_load(jsonlite,tidygraph,ggraph,visNetwork,graphlayouts,ggforce,skimr,
tidytext,tidyverse,ggstatsplot,ggiraph)Take-home Exercise 3
Importing JSON file by using jsonlite packages
mc3_data <- fromJSON("data/MC3.json")Extracting edges
mc3_edges <- as_tibble(mc3_data$links) %>%
distinct() %>%
mutate(source = as.character(source),
target = as.character(target),
type = as.character(type)) %>%
group_by(source, target, type) %>%
summarise(weights=n()) %>%
filter(source!=target) %>%
ungroup()Extracting nodes
mc3_nodes <- as_tibble(mc3_data$nodes) %>%
mutate(country = as.character(country),
id = as.character(id),
product_services = as.character(product_services),
revenue_omu = as.numeric(as.character(revenue_omu)),
type = as.character(type)) %>%
select(id, country, type, revenue_omu, product_services)write_rds(mc3_nodes, "data/mc3_nodes.rds")
write_rds(mc3_edges, "data/mc3_edges.rds")Edge table
Show the code
DT::datatable(mc3_edges)Plotting by type
Show the code
ggplot(data = mc3_edges,
aes(x = type)) +
geom_bar()
Node table
Show the code
DT::datatable(mc3_nodes)Plotting by type
Show the code
ggplot(data = mc3_nodes,
aes(x = type)) +
geom_bar()
Visualisation and Analysis
Building network model
id1 <- mc3_edges %>%
select(source) %>%
rename(id = source)
id2 <- mc3_edges %>%
select(target) %>%
rename(id = target)
mc3_nodes1 <- rbind(id1, id2) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop")mc3_graph <- tbl_graph(nodes = mc3_nodes1,
edges = mc3_edges,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
mc3_graph %>%
filter(betweenness_centrality >= 100000) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha=0.5)) +
geom_node_point(aes(
size = betweenness_centrality,
colors = "lightblue",
alpha = 0.5)) +
scale_size_continuous(range=c(1,10))+
theme_graph()
Text Sensing
Simple word count
mc3_nodes %>%
mutate(n_fish = str_count(product_services, "fish")) # A tibble: 27,622 × 6
id country type revenue_omu product_services n_fish
<chr> <chr> <chr> <dbl> <chr> <int>
1 Jones LLC ZH Comp… 310612303. Automobiles 0
2 Coleman, Hall and Lopez ZH Comp… 162734684. Passenger cars,… 0
3 Aqua Advancements Sashimi … Oceanus Comp… 115004667. Holding firm wh… 0
4 Makumba Ltd. Liability Co Utopor… Comp… 90986413. Car service, ca… 0
5 Taylor, Taylor and Farrell ZH Comp… 81466667. Fully electric … 0
6 Harmon, Edwards and Bates ZH Comp… 75070435. Discount superm… 0
7 Punjab s Marine conservati… Riodel… Comp… 72167572. Beef, pork, chi… 0
8 Assam Limited Liability … Utopor… Comp… 72162317. Power and Gas s… 0
9 Ianira Starfish Sagl Import Rio Is… Comp… 68832979. Light commercia… 0
10 Moran, Lewis and Jimenez ZH Comp… 65592906. Automobiles, tr… 0
# ℹ 27,612 more rows
Tokenisation
token_nodes <- mc3_nodes %>%
unnest_tokens(word,
product_services)
token_nodes %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "Count",
y = "Unique words",
title = "Count of unique words found in product_services field")
Removing stopwords
select nodes with “seafood”,“fish”,“carp”,“catfish”,“herring”,“mackerel”,“pollock”,“salmon”,“shark”,“tuna” as part of the product service
stopwords_removed <- token_nodes %>%
anti_join(stop_words) %>%
filter(word %in% c("seafood","fish","carp","catfish","herring","mackerel","pollock","salmon","shark","tuna")) %>%
distinct()
stopwords_removed %>%
count(word, sort = TRUE) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "Count",
y = "Unique words",
title = "Count of unique words found in product_services field")
Plotting by type
Show the code
ggplot(data = stopwords_removed,
aes(x = type)) +
geom_bar()+
geom_text(stat="count",
aes(label=paste0(after_stat(count))),vjust=-1)+
ylim(0,1500)
Analyze company type
Since from the above, the number of company type is much greater than the other 2 types, we will focus on the company type and find the distribution of revenue_omu
Show the code
clean_nodes_c <-stopwords_removed %>%
drop_na(revenue_omu) %>%
filter(type=="Company")
set.seed(1234)
gghistostats(
data = clean_nodes_c,
x = revenue_omu,
type = "bayes",
test.value = 60,
xlab = "revenue_omu"
)
Most of the companies have a revenue_omu within the first bar, but there are some companies that have far more revenue than others, we select the revenue_omu>400,000
Show the code
df_nodes <- clean_nodes_c %>%
filter(revenue_omu>200000)
df_edges <- mc3_edges %>%
filter(source %in% df_nodes$id)
id3 <- df_edges %>%
select(source) %>%
rename(id = source)
id4 <- df_edges %>%
select(target) %>%
rename(id = target)
df_nodes_1 <- rbind(id3, id4) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop")
df_graph <- tbl_graph(nodes = df_nodes_1,
edges = df_edges,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
g <- df_graph %>%
mutate(betweenness_centrality = centrality_betweenness()) %>%
ggraph(layout = "kk") +
geom_edge_link(aes(width=weights),
alpha=0.2) +
scale_edge_width(range = c(0.01, 0.1)) +
geom_node_point(aes(colour = country,
size=betweenness_centrality))
g + theme_graph()
The companies in country ZH seems to have a high revenue_omu but they dont have many business partner records
Grouping
Calculate partner numbers (numbers of targets of a source), and assign partner = -1 if targets dont have a partner record, we only select those with a partner and group them by revenue_omu and partner numbers.
Show the code
df_edges_1 <- mc3_edges %>%
filter(source %in% clean_nodes_c$id)
df_edges_1r <- df_edges_1 %>%
group_by(source) %>%
summarize(partners=n_distinct(target)) %>%
rename(id=source) %>%
ungroup()
df_nodes_2 <- clean_nodes_c %>%
left_join(df_edges_1r) %>%
distinct()
df_nodes_2$partners[is.na(df_nodes_2$partners)] <- -1
df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 1
df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 2
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 3
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 4
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 5
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 6
df_nodes_2 <- df_nodes_2[,!names(df_nodes_2) %in%
c("word")] %>%
distinct()
set.seed(1234)
gghistostats(
data = df_nodes_2[df_nodes_2$partners>0,],
x = partners,
type = "bayes",
test.value = 60,
xlab = "partners")
visualize the nodes and edges and since Group 5 and 6 dont have partners, they will not appear in the network
Show the code
df_edges_2 <- mc3_edges %>%
filter(source %in% df_nodes_2$id)
id5 <- df_edges_2 %>%
select(source) %>%
rename(id = source)
id6 <- df_edges_2 %>%
select(target) %>%
rename(id = target)
df_nodes_3 <- rbind(id5, id6) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop") %>%
left_join(df_nodes_2)
df_graph_3 <- tbl_graph(nodes = df_nodes_3,
edges = df_edges_2,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
g_3 <- df_graph_3 %>%
mutate(betweenness_centrality = centrality_betweenness()) %>%
ggraph(layout = "kk") +
geom_edge_link(aes(width=weights),
alpha=0.2) +
scale_edge_width(range = c(0.01, 0.1)) +
geom_node_point(aes(colour = group,
size=betweenness_centrality))
g_3 + theme_graph()
write_rds(df_nodes_3, "data/df_nodes_3.rds")
write_rds(df_edges_2, "data/df_edges_2.rds")
write_rds(df_graph_3, "data/df_graph_3.rds")